knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(dplyr)
library(broom)

# Load the two ZIP-code demographic tables and merge them. The key column is
# spelled "Zipcode" in the first table and "ZIPCODE" in the second.
nyc_demo <- read_csv("nyc_zip_demo.csv")
nyc_demo_more <- read_csv("raw_data/nyc_zip_demographics.csv")
nyc_demo <- nyc_demo %>%
  left_join(nyc_demo_more, by = c("Zipcode" = "ZIPCODE"))

# Attach the demographics to the coffee table. No `by` is supplied, so dplyr
# joins on every column name the two tables have in common.
coffee_zip <- read_csv("coffee_zip.csv")
df <- coffee_zip %>%
  left_join(nyc_demo) %>%
  mutate(Women_pct = 100 - Men_pct)  # complement of the male share

visualization

library(ggthemes)
library(ggplot2)
library(tidyr)

# Average rating against Total_Income, Mean_Income and Population, coloured by
# coffee chain (`Name`), each with a dashed per-chain least-squares line.

# Layers common to the three plots. `radius` is not a geom_point() parameter
# (ggplot2 warns and ignores it), so it is dropped. The smoother is named by
# string so it does not depend on what the global name `lm` is bound to.
rating_layers <- list(
  geom_point(alpha = 0.8),
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed"),
  theme_linedraw(),
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")),
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
)

pc1 <- ggplot(df, aes(x = Total_Income, y = Avg_rating, color = Name)) +
  rating_layers +
  labs(
    x = "Total Income",
    y = "Average Rating for Coffee Shops",
    title = "Total Income vs Average Rating for Coffee",
    caption = "Data Source: Yelp"
  )
pc1

pc2 <- ggplot(df, aes(x = Mean_Income, y = Avg_rating, color = Name)) +
  rating_layers +
  labs(
    x = "Mean Income",
    y = "Average Rating for Coffee Shops",
    title = "Mean Income vs Average Rating for Coffee",
    caption = "Data Source: Yelp"
  )
pc2

pc3 <- ggplot(df, aes(x = Population, y = Avg_rating, color = Name)) +
  rating_layers +
  labs(
    x = "Population",
    y = "Average Rating for Coffee Shops",
    title = "Population vs Average Rating for Coffee",
    caption = "Data Source: Yelp"
  )
pc3

clean up data and do more visualization with demographics

poverty

# Average rating against Poverty_pct, coloured by coffee chain (`Name`), with
# a dashed per-chain least-squares line.
# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc6 <- ggplot(df, aes(x = Poverty_pct, y = Avg_rating, color = Name)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Poverty Percentage",
    y = "Average Rating for Coffee Shops",
    title = "Poverty Percentage vs Average Rating for Coffee",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc6

unemployment

# Two unemployment plots, coloured by coffee chain (`Name`), each with a
# dashed per-chain least-squares line:
#   pc7 - Unemployment_pct against the average rating
#   pc8 - Unemployment_pct against Poverty_pct
# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc7 <- ggplot(df, aes(x = Unemployment_pct, y = Avg_rating, color = Name)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Unemployment Percentage",
    y = "Average Rating for Coffee Shops",
    title = "Unemployment Percentage vs Average Rating for Coffee",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc7

pc8 <- ggplot(df, aes(x = Unemployment_pct, y = Poverty_pct, color = Name)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Unemployment Percentage",
    y = "Poverty Percentage",
    title = "Unemployment Percentage vs Poverty Percentage",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc8

linear regression

# Copy df and replace each chain name with a short code, kept as text
# (assumes Name is a character column, as read_csv would produce - TODO
# confirm):
#   "Dunkin'" -> "0", "Starbucks" -> "1", "Blue Bottle" -> "2"
# Any other value of Name is left untouched.
coffee_lm <- df %>%
  mutate(Name = case_when(
    Name == "Dunkin'" ~ "0",
    Name == "Starbucks" ~ "1",
    Name == "Blue Bottle" ~ "2",
    TRUE ~ Name
  ))

# One subset per chain, used by the per-chain plots and regressions.
coffee_dd <- filter(coffee_lm, Name == "0")
coffee_sb <- filter(coffee_lm, Name == "1")
coffee_bb <- filter(coffee_lm, Name == "2")
# write_csv(coffee_sb, "./cleaned_sb_only.csv", na = "NA" )
# write_csv(coffee_dd, "./cleaned_dd_only.csv", na = "NA" )

gender for SB

# Starbucks: stack Men_pct and Women_pct into long form (one row per shop and
# gender column) and plot the share against the average rating, with one
# dashed least-squares line per gender column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_sb %>%
  gather(key = "gender", value = "gender_value", Men_pct, Women_pct)

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = gender_value, y = Avg_rating, color = gender)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Gender Percentage",
    y = "Average Rating for Starbucks",
    title = "Gender Percentage vs Average Rating for Starbucks",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

gender for DD

# Dunkin': stack Men_pct and Women_pct into long form (one row per shop and
# gender column) and plot the share against the average rating, with one
# dashed least-squares line per gender column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_dd %>%
  gather(key = "gender", value = "gender_value", Men_pct, Women_pct)

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = gender_value, y = Avg_rating, color = gender)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Gender Percentage",
    y = "Average Rating for the Dunkin",
    title = "Gender Percentage vs Average Rating for the Dunkin",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

gender for BB

# Blue Bottle: stack Men_pct and Women_pct into long form (one row per shop
# and gender column) and plot the share against the average rating, with one
# dashed least-squares line per gender column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_bb %>%
  gather(key = "gender", value = "gender_value", Men_pct, Women_pct)

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = gender_value, y = Avg_rating, color = gender)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Gender Percentage",
    y = "Average Rating for Blue Bottle",
    title = "Gender Percentage vs Average Rating for Blue Bottle",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(values = c("#34afd1", "#f09a56", "#87dc97")) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

race pct for SB

# Starbucks: stack the four race-share columns into long form (one row per
# shop and race column) and plot the share against the average rating, with
# one dashed least-squares line per race column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_sb %>%
  gather(
    key = "race", value = "race_value",
    White_pct, Black_pct, Native_pct, Asian_pct
  )

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = race_value, y = Avg_rating, color = race)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Race Percentage",
    y = "Average Rating for Starbucks",
    title = "Race Percentage vs Average Rating for Starbucks",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(
    values = c("#34afd1", "#f09a56", "#87dc97", "#ABA9A9")
  ) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

race pct for DD

# Dunkin': stack the four race-share columns into long form (one row per shop
# and race column) and plot the share against the average rating, with one
# dashed least-squares line per race column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_dd %>%
  gather(
    key = "race", value = "race_value",
    White_pct, Black_pct, Native_pct, Asian_pct
  )

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = race_value, y = Avg_rating, color = race)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Race Percentage",
    y = "Average Rating for the Dunkin",
    title = "Race Percentage vs Average Rating for the Dunkin",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(
    values = c("#34afd1", "#f09a56", "#87dc97", "#ABA9A9")
  ) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

race pct for BB

# Blue Bottle: stack the four race-share columns into long form (one row per
# shop and race column) and plot the share against the average rating, with
# one dashed least-squares line per race column.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here so
# the reshaped table is unchanged.
df_gather <- coffee_bb %>%
  gather(
    key = "race", value = "race_value",
    White_pct, Black_pct, Native_pct, Asian_pct
  )

# `radius` is not a geom_point() parameter (ggplot2 warns and ignores it), so
# it is dropped; the smoother is named by string so it does not depend on what
# the global name `lm` is bound to.
pc_race <- ggplot(
  df_gather,
  aes(x = race_value, y = Avg_rating, color = race)
) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(
    x = "Race Percentage",
    y = "Average Rating for Blue Bottle",
    title = "Race Percentage vs Average Rating for Blue Bottle",
    caption = "Data Source: Yelp"
  ) +
  theme_linedraw() +
  scale_color_manual(
    values = c("#34afd1", "#f09a56", "#87dc97", "#ABA9A9")
  ) +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 8)
  )
pc_race

Rating vs Total Income

library(jtools)

# Simple regressions of Avg_rating on Total_Income, one per chain. Each model
# is reported twice: with base summary() and with jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Total_Income, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Total_Income, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle. This third model must use coffee_bb; fitting coffee_dd again
# would only repeat the Dunkin' result.
fit_bb <- lm(Avg_rating ~ Total_Income, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

Rating vs Mean Income

# Simple regressions of Avg_rating on Mean_Income, one per chain, each
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Mean_Income, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Mean_Income, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Mean_Income, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

Rating vs Population

# Simple regressions of Avg_rating on Population, one per chain, each reported
# with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Population, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Population, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Population, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

Rating vs Gender

SB

# Starbucks: Avg_rating regressed on each gender share in turn, reported with
# base summary() and jtools' summ().
# Women_pct is computed as 100 - Men_pct, so the two models describe the same
# relationship with the slope sign flipped.
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_women <- lm(Avg_rating ~ Women_pct, data = coffee_sb)
summary(fit_women)
summ(fit_women)

fit_men <- lm(Avg_rating ~ Men_pct, data = coffee_sb)
summary(fit_men)
summ(fit_men)

DD

# Dunkin': Avg_rating regressed on each gender share in turn, reported with
# base summary() and jtools' summ().
# Women_pct is computed as 100 - Men_pct, so the two models describe the same
# relationship with the slope sign flipped.
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_women <- lm(Avg_rating ~ Women_pct, data = coffee_dd)
summary(fit_women)
summ(fit_women)

fit_men <- lm(Avg_rating ~ Men_pct, data = coffee_dd)
summary(fit_men)
summ(fit_men)

BB

# Blue Bottle: Avg_rating regressed on each gender share in turn, reported
# with base summary() and jtools' summ().
# Women_pct is computed as 100 - Men_pct, so the two models describe the same
# relationship with the slope sign flipped.
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_women <- lm(Avg_rating ~ Women_pct, data = coffee_bb)
summary(fit_women)
summ(fit_women)

fit_men <- lm(Avg_rating ~ Men_pct, data = coffee_bb)
summary(fit_men)
summ(fit_men)

Rating vs Race

SB

# Starbucks: Avg_rating regressed on each race-share column separately,
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_white <- lm(Avg_rating ~ White_pct, data = coffee_sb)
summary(fit_white)
summ(fit_white)

fit_asian <- lm(Avg_rating ~ Asian_pct, data = coffee_sb)
summary(fit_asian)
summ(fit_asian)

fit_black <- lm(Avg_rating ~ Black_pct, data = coffee_sb)
summary(fit_black)
summ(fit_black)

fit_native <- lm(Avg_rating ~ Native_pct, data = coffee_sb)
summary(fit_native)
summ(fit_native)

DD

# Dunkin': Avg_rating regressed on each race-share column separately,
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_white <- lm(Avg_rating ~ White_pct, data = coffee_dd)
summary(fit_white)
summ(fit_white)

fit_asian <- lm(Avg_rating ~ Asian_pct, data = coffee_dd)
summary(fit_asian)
summ(fit_asian)

fit_black <- lm(Avg_rating ~ Black_pct, data = coffee_dd)
summary(fit_black)
summ(fit_black)

fit_native <- lm(Avg_rating ~ Native_pct, data = coffee_dd)
summary(fit_native)
summ(fit_native)

BB

# Blue Bottle: Avg_rating regressed on each race-share column separately,
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_white <- lm(Avg_rating ~ White_pct, data = coffee_bb)
summary(fit_white)
summ(fit_white)

fit_asian <- lm(Avg_rating ~ Asian_pct, data = coffee_bb)
summary(fit_asian)
summ(fit_asian)

fit_black <- lm(Avg_rating ~ Black_pct, data = coffee_bb)
summary(fit_black)
summ(fit_black)

fit_native <- lm(Avg_rating ~ Native_pct, data = coffee_bb)
summary(fit_native)
summ(fit_native)

Rating vs Poverty

# Simple regressions of Avg_rating on Poverty_pct, one per chain, each
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Poverty_pct, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Poverty_pct, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Poverty_pct, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

Rating vs Unemployment

# Simple regressions of Avg_rating on Unemployment_pct, one per chain, each
# reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Unemployment_pct, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Unemployment_pct, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Unemployment_pct, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

add interaction term

# Avg_rating on Population, Mean_Income and their interaction (the `*` in the
# formula expands to both main effects plus Population:Mean_Income), one model
# per chain, each reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Population * Mean_Income, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Population * Mean_Income, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Population * Mean_Income, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

More: Population + Mean Income (no interaction)

# Additive model: Avg_rating on Population and Mean_Income without an
# interaction term, one model per chain, each reported with base summary() and
# jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(Avg_rating ~ Population + Mean_Income, data = coffee_sb)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(Avg_rating ~ Population + Mean_Income, data = coffee_dd)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(Avg_rating ~ Population + Mean_Income, data = coffee_bb)
summary(fit_bb)
summ(fit_bb)

More 2.0: Population + Mean Income + average review count

# Additive model extended with Avg_review_count: Avg_rating on Population,
# Mean_Income and Avg_review_count, one model per chain, each reported with
# base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Starbucks
fit_sb <- lm(
  Avg_rating ~ Population + Mean_Income + Avg_review_count,
  data = coffee_sb
)
summary(fit_sb)
summ(fit_sb)

# Dunkin'
fit_dd <- lm(
  Avg_rating ~ Population + Mean_Income + Avg_review_count,
  data = coffee_dd
)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(
  Avg_rating ~ Population + Mean_Income + Avg_review_count,
  data = coffee_bb
)
summary(fit_bb)
summ(fit_bb)

More 3.0: Population x Mean Income interaction + average review count

# Interaction model extended with Avg_review_count: Avg_rating on Population,
# Mean_Income, their interaction and Avg_review_count, one model per chain,
# each reported with base summary() and jtools' summ().
# The fits get their own names instead of being assigned to `lm`, which would
# shadow the stats::lm function.

# Dunkin'
fit_dd <- lm(
  Avg_rating ~ Population * Mean_Income + Avg_review_count,
  data = coffee_dd
)
summary(fit_dd)
summ(fit_dd)

# Blue Bottle
fit_bb <- lm(
  Avg_rating ~ Population * Mean_Income + Avg_review_count,
  data = coffee_bb
)
summary(fit_bb)
summ(fit_bb)

# Starbucks
fit_sb <- lm(
  Avg_rating ~ Population * Mean_Income + Avg_review_count,
  data = coffee_sb
)
summary(fit_sb)
summ(fit_sb)

More 4.0: all predictors together (Starbucks only)

# Starbucks only: Avg_rating on all nine predictors at once, reported with
# base summary() and jtools' summ().
# Women_pct and Native_pct are not in the formula; Women_pct is an exact
# linear function of Men_pct (100 - Men_pct), so it could not be estimated
# alongside it.
# The fit gets its own name instead of being assigned to `lm`, which would
# shadow the stats::lm function.
fit_full <- lm(
  Avg_rating ~ Population + Mean_Income + Avg_review_count +
    Men_pct + White_pct + Asian_pct + Black_pct +
    Poverty_pct + Unemployment_pct,
  data = coffee_sb
)
summary(fit_full)
summ(fit_full)


nikkyxiong/coffee_rating_and_nyc_demographics documentation built on May 16, 2020, 9:27 a.m.